library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(maps)
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
knitr::opts_chunk$set(
    echo = TRUE,
    warning = FALSE,
    fig.width = 8, 
  fig.height = 6,
  out.width = "90%"
)

theme_set(theme_minimal() + theme(legend.position = "bottom"))

options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d

##visualizations of US map require install of sf and maps packages

import tidy dataset:

##note: should we export tidy dataset to our directory on github so we can delete this code from any EDA?

rawdata =
  read_csv("data/mental_health.csv")
## Rows: 10404 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Indicator, Group, State, Subgroup, Phase, Time Period Label, Time ...
## dbl  (5): Time Period, Value, LowCI, HighCI, Suppression Flag
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tidydata = 
  rawdata |> 
  janitor::clean_names() |>
  mutate(indicator = str_replace(indicator, ", Last 4 Weeks", "")) |> 
  select(-phase, -quartile_range, -suppression_flag) |> 
  mutate(group = str_replace(group, "By ", "")) |>
  drop_na(value) |>
  select(start_date = time_period_start_date, end_date = time_period_end_date, everything())

tidydata =
  tidydata |>
  mutate(week_number = match(time_period_label, unique(time_period_label))) |>
  mutate(
    start_dates = mdy(pull(tidydata, start_date)),
    end_dates = mdy(pull(tidydata, end_date))) |> 
  mutate(tpl = time_period_label) |>  
  mutate(year = as.numeric(str_extract(tpl, "\\d{4}")))|> 
  separate(col = time_period_label, into = c("week_label", "years"), sep = ", ", remove = FALSE, extra = "merge") |>
  select(indicator, year, start_dates, end_dates, week_number, week_label = time_period_label, state, group, subgroup, value, low_ci, high_ci, confidence_interval)

EDA

This EDA will analyze mental health care access trends by state.

I will first create a dataframe that only includes state-level observations.

state_df =
  tidydata |> 
  filter(group == "State")

Indicator 1: Took prescription

Let’s start with the mental health indicator “Took Prescription Medication for Mental Health”

The following chunk shows the top five states with the highest mean percentage of respondents who reported taking prescription medication for mental health from 2020-2022.

state_meds_top5 =
  state_df |> 
  filter(indicator == "Took Prescription Medication for Mental Health") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(desc(mean_value)) |> 
  top_n(5) 
## Selecting by mean_value
state_meds_top5 |> 
  knitr::kable()
state mean_value
West Virginia 28.84848
Kentucky 27.37879
Arkansas 26.69091
Utah 25.75455
Oklahoma 25.70909

Let’s visualize a heat map of this indicator’s prevalence by US state.

First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.

state_meds =
  state_df |> 
  filter(indicator == "Took Prescription Medication for Mental Health") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  mutate(state = tolower(state))

usa = 
  st_as_sf(map("state", fill = TRUE, plot = FALSE))

us_meds =
  merge(usa, state_meds, by.x = "ID", by.y = "state", all.x = TRUE)

Next, we’ll plot the data using ggplot

heatmap_meds=
ggplot(us_meds)+
  geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
  scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % taking prescription medications")+
  theme_minimal()+
  theme(legend.position = "right",
        plot.title.position = "plot")+
  labs(title = "Average % used prescription medication for mental health 2020-2022, by state")

print(heatmap_meds)

We can also look at which states have had the greatest change in medication use. This shows that not only does WV have the highest prevalence of medication use for mental health during this period, but it also has had the highest level of change in this indicator between 2020-2022.

state_meds_change =
  state_df |> 
  filter(indicator == "Took Prescription Medication for Mental Health", year != "2021") |> 
  group_by(state, year) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(state, year) |> 
  mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_meds_change |> 
  filter(!is.na(val_change)) |> 
  ungroup() |> 
  select(-year, -mean_value) |> 
  slice_max(order_by = val_change, n=5) |> 
  knitr::kable()
state val_change
West Virginia 7.044444
Nebraska 5.219444
North Dakota 5.136111
Minnesota 4.502778
Kentucky 4.452778

Let’s take a look at the trend for this indicator over time.

state_df |>
  filter(indicator == "Took Prescription Medication for Mental Health") |> 
  group_by(state, year) |>
  summarize(mean_value = mean(value)) |> 
  mutate(year = factor(year)) |> 
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
  layout(
    title = "% used prescription medication for mental health by state & year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Average % used prescription medication for mental health"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.

Indicator 2: Received Counseling or Therapy

Now we’ll look at the mental health indicator “Received Counseling or Therapy”

The following chunk shows the top five states with the highest mean percentage of respondents who reported receiving counseling or therapy from 2020-2022. The state with the highest percentage during this period was District of Columbia.

state_ther_top5 =
  state_df |> 
  filter(indicator == "Received Counseling or Therapy") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(desc(mean_value)) |> 
  top_n(5) 
## Selecting by mean_value
state_ther_top5 |> 
  knitr::kable()
state mean_value
District of Columbia 18.08182
Massachusetts 14.92727
Rhode Island 14.24848
Vermont 13.72121
Oregon 12.97273

Let’s visualize a heat map of this indicator’s prevalence by US state.

First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.

state_ther =
  state_df |> 
  filter(indicator == "Received Counseling or Therapy") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  mutate(state = tolower(state))

usa = 
  st_as_sf(map("state", fill = TRUE, plot = FALSE))

us_ther =
  merge(usa, state_ther, by.x = "ID", by.y = "state", all.x = TRUE)

Next, we’ll plot the data using ggplot

heatmap_ther=
ggplot(us_ther)+
  geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
  scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % receiving therapy")+
  theme_minimal()+
  theme(legend.position = "right",
        plot.title.position = "plot")+
  labs(title = "Average % receiving counseling or therapy 2020-2022, by state")

print(heatmap_ther)

We can also look at which states have had the greatest change in therapy use. This shows that NE had the highest level of change in this indicator between 2020-2022. It also had a high level of change in indicator 1

state_ther_change =
  state_df |> 
  filter(indicator == "Received Counseling or Therapy", year != "2021") |> 
  group_by(state, year) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(state, year) |> 
  mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_ther_change |> 
  filter(!is.na(val_change)) |> 
  ungroup() |> 
  select(-year, -mean_value) |> 
  slice_max(order_by = val_change, n=5) |> 
  knitr::kable()
state val_change
Nebraska 4.369444
North Dakota 3.922222
Montana 3.694444
Oklahoma 3.091667
Delaware 2.661111

Let’s take a look at the trend for this indicator over time.

state_df |>
  filter(indicator == "Received Counseling or Therapy") |> 
  group_by(state, year) |>
  summarize(mean_value = mean(value)) |> 
  mutate(year = factor(year)) |> 
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
  layout(
    title = "% received counseling or therapy by state & year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Average % received therapy"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.

Indicator 3: “Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy”

The third indicator we’ll look at combines the first two indicators into one category. This can help give a snapshot of overall mental health care need.

The following chunk shows the top five states with the highest mean percentage of respondents who reported mental health medication and/or therapy use from 2020-2022. The state with the highest percentage during this period was West Virginia.

state_both_top5 =
  state_df |> 
  filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(desc(mean_value)) |> 
  top_n(5) 
## Selecting by mean_value
state_both_top5 |> 
  knitr::kable()
state mean_value
West Virginia 30.74242
Utah 29.82424
Kentucky 29.72121
Vermont 29.68182
Maine 29.63333

Let’s visualize a heat map of this indicator’s prevalence by US state.

First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.

state_both =
  state_df |> 
  filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  mutate(state = tolower(state))

usa = 
  st_as_sf(map("state", fill = TRUE, plot = FALSE))

us_both =
  merge(usa, state_both, by.x = "ID", by.y = "state", all.x = TRUE)

Next, we’ll plot the data using ggplot

heatmap_both=
ggplot(us_both)+
  geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
  scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % medication and/or therapy use")+
  theme_minimal()+
  theme(legend.position = "right",
        plot.title.position = "plot")+
  labs(title = "Average % using medication and/or therapy 2020-2022, by state")

print(heatmap_both)

We can also look at which states have had the greatest change in mental health care use. This shows that WV had the highest level of change in this indicator between 2020-2022. This makes sense, as WV had the highest change in medication use.

state_both_change =
  state_df |> 
  filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy", year != "2021") |> 
  group_by(state, year) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(state, year) |> 
  mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_both_change |> 
  filter(!is.na(val_change)) |> 
  ungroup() |> 
  select(-year, -mean_value) |> 
  slice_max(order_by = val_change, n=5) |> 
  knitr::kable()
state val_change
West Virginia 7.155556
North Dakota 6.500000
Nebraska 5.747222
Minnesota 4.850000
Oklahoma 4.736111

Let’s take a look at the trend for this indicator over time.

state_df |>
  filter(indicator == "Took Prescription Medication for Mental Health And/Or Received Counseling or Therapy") |> 
  group_by(state, year) |>
  summarize(mean_value = mean(value)) |> 
  mutate(year = factor(year)) |> 
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
  layout(
    title = "% medication and/or therapy use by state & year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Average % medication and/or therapy use"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.

Indicator 4: Needed Counseling or Therapy But Did Not Get It

The last indicator we’ll look at is describes gaps in mental health care access, specifically counseling and therapy.

The following chunk shows the top five states with the highest mean percentage of respondents who reported no receipt of needed counseling/therapy from 2020-2022. The state with the highest percentage during this period was Oregon. Surprisingly, District of Columbia also appeared on the top 5, even though it was the “state” with the highest level of therapy use during this period.

state_need_top5 =
  state_df |> 
  filter(indicator == "Needed Counseling or Therapy But Did Not Get It") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(desc(mean_value)) |> 
  top_n(5) 
## Selecting by mean_value
state_need_top5 |> 
  knitr::kable()
state mean_value
Oregon 14.43939
Utah 13.77879
District of Columbia 13.67273
Colorado 13.02727
Washington 12.89697

Let’s visualize a heat map of this indicator’s prevalence by US state.

First, we’ll merge the filtered dataset with an existing US state map dataset to assist with mapping.

state_need =
  state_df |> 
  filter(indicator == "Needed Counseling or Therapy But Did Not Get It") |> 
  group_by(state) |> 
  summarize(mean_value = mean(value)) |> 
  mutate(state = tolower(state))

usa = 
  st_as_sf(map("state", fill = TRUE, plot = FALSE))

us_need =
  merge(usa, state_need, by.x = "ID", by.y = "state", all.x = TRUE)

Next, we’ll plot the data using ggplot

heatmap_need=
ggplot(us_need)+
  geom_sf(aes(fill = mean_value), color = "white", size = 0.2)+
  scale_fill_gradient(low = "white", high = "red", na.value = "grey50", name = "Avg % did not receive needed therapy")+
  theme_minimal()+
  theme(legend.position = "right",
        plot.title.position = "plot")+
  labs(title = "Average % did not receive needed therapy 2020-2022, by state")

print(heatmap_need)

We can also look at which states have had the greatest change in unmet mental health care need. This shows that AK had the highest level of change in this indicator between 2020-2022.

state_need_change =
  state_df |> 
  filter(indicator == "Needed Counseling or Therapy But Did Not Get It", year != "2021") |> 
  group_by(state, year) |> 
  summarize(mean_value = mean(value)) |> 
  arrange(state, year) |> 
  mutate(val_change = c(NA, diff(mean_value)))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.
state_need_change |> 
  filter(!is.na(val_change)) |> 
  ungroup() |> 
  select(-year, -mean_value) |> 
  slice_max(order_by = val_change, n=5) |> 
  knitr::kable()
state val_change
Arkansas 3.127778
Wyoming 2.694444
South Dakota 2.538889
Texas 2.363889
Wisconsin 2.308333

Let’s take a look at the trend for this indicator over time.

state_df |>
  filter(indicator == "Needed Counseling or Therapy But Did Not Get It") |> 
  group_by(state, year) |>
  summarize(mean_value = mean(value)) |> 
  mutate(year = factor(year)) |> 
plot_ly(x = ~year, y = ~mean_value, color = ~state, type = "scatter", mode = "lines+markers") |>
  layout(
    title = "% did not receive needed therapy by state & year",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Avg % did not receive needed therapy"))
## `summarise()` has grouped output by 'state'. You can override using the
## `.groups` argument.